In [38]:
#importing libraries to perform EDA
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.gridspec as gridspec
import matplotlib.cm as cm
In [39]:
pip install datetime
Requirement already satisfied: datetime in c:\users\ramasish chatterjee\appdata\local\programs\python\python312\lib\site-packages (5.5) Requirement already satisfied: zope.interface in c:\users\ramasish chatterjee\appdata\local\programs\python\python312\lib\site-packages (from datetime) (6.4.post2) Requirement already satisfied: pytz in c:\users\ramasish chatterjee\appdata\local\programs\python\python312\lib\site-packages (from datetime) (2024.1) Requirement already satisfied: setuptools in c:\users\ramasish chatterjee\appdata\local\programs\python\python312\lib\site-packages (from zope.interface->datetime) (70.3.0) Note: you may need to restart the kernel to use updated packages.
[notice] A new release of pip is available: 24.1.2 -> 24.2 [notice] To update, run: python.exe -m pip install --upgrade pip
In [40]:
pip install statsmodels
Requirement already satisfied: statsmodels in c:\users\ramasish chatterjee\appdata\local\programs\python\python312\lib\site-packages (0.14.2) Requirement already satisfied: numpy>=1.22.3 in c:\users\ramasish chatterjee\appdata\local\programs\python\python312\lib\site-packages (from statsmodels) (2.0.0) Requirement already satisfied: scipy!=1.9.2,>=1.8 in c:\users\ramasish chatterjee\appdata\local\programs\python\python312\lib\site-packages (from statsmodels) (1.14.0) Requirement already satisfied: pandas!=2.1.0,>=1.4 in c:\users\ramasish chatterjee\appdata\local\programs\python\python312\lib\site-packages (from statsmodels) (2.2.2) Requirement already satisfied: patsy>=0.5.6 in c:\users\ramasish chatterjee\appdata\local\programs\python\python312\lib\site-packages (from statsmodels) (0.5.6) Requirement already satisfied: packaging>=21.3 in c:\users\ramasish chatterjee\appdata\local\programs\python\python312\lib\site-packages (from statsmodels) (23.2) Requirement already satisfied: python-dateutil>=2.8.2 in c:\users\ramasish chatterjee\appdata\local\programs\python\python312\lib\site-packages (from pandas!=2.1.0,>=1.4->statsmodels) (2.8.2) Requirement already satisfied: pytz>=2020.1 in c:\users\ramasish chatterjee\appdata\local\programs\python\python312\lib\site-packages (from pandas!=2.1.0,>=1.4->statsmodels) (2024.1) Requirement already satisfied: tzdata>=2022.7 in c:\users\ramasish chatterjee\appdata\local\programs\python\python312\lib\site-packages (from pandas!=2.1.0,>=1.4->statsmodels) (2024.1) Requirement already satisfied: six in c:\users\ramasish chatterjee\appdata\local\programs\python\python312\lib\site-packages (from patsy>=0.5.6->statsmodels) (1.16.0) Note: you may need to restart the kernel to use updated packages.
[notice] A new release of pip is available: 24.1.2 -> 24.2 [notice] To update, run: python.exe -m pip install --upgrade pip
In [41]:
pip install holidays
Requirement already satisfied: holidays in c:\users\ramasish chatterjee\appdata\local\programs\python\python312\lib\site-packages (0.52) Requirement already satisfied: python-dateutil in c:\users\ramasish chatterjee\appdata\local\programs\python\python312\lib\site-packages (from holidays) (2.8.2) Requirement already satisfied: six>=1.5 in c:\users\ramasish chatterjee\appdata\local\programs\python\python312\lib\site-packages (from python-dateutil->holidays) (1.16.0) Note: you may need to restart the kernel to use updated packages.
[notice] A new release of pip is available: 24.1.2 -> 24.2 [notice] To update, run: python.exe -m pip install --upgrade pip
In [42]:
df1 = pd.read_csv("C:/Users/Ramasish Chatterjee/Downloads/train(1).csv") #to read the csv file saved
In [43]:
df=df1
In [44]:
df.head() #to get top 06 values
Out[44]:
| datetime | season | holiday | workingday | weather | temp | atemp | humidity | windspeed | casual | registered | count | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2011-01-01 00:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 81 | 0.0 | 3 | 13 | 16 |
| 1 | 2011-01-01 01:00:00 | 1 | 0 | 0 | 1 | 9.02 | 13.635 | 80 | 0.0 | 8 | 32 | 40 |
| 2 | 2011-01-01 02:00:00 | 1 | 0 | 0 | 1 | 9.02 | 13.635 | 80 | 0.0 | 5 | 27 | 32 |
| 3 | 2011-01-01 03:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 75 | 0.0 | 3 | 10 | 13 |
| 4 | 2011-01-01 04:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 75 | 0.0 | 0 | 1 | 1 |
In [45]:
df.shape
Out[45]:
(10886, 12)
In [46]:
df.columns
Out[46]:
Index(['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp',
'atemp', 'humidity', 'windspeed', 'casual', 'registered', 'count'],
dtype='object')
In [47]:
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10886 entries, 0 to 10885 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 datetime 10886 non-null object 1 season 10886 non-null int64 2 holiday 10886 non-null int64 3 workingday 10886 non-null int64 4 weather 10886 non-null int64 5 temp 10886 non-null float64 6 atemp 10886 non-null float64 7 humidity 10886 non-null int64 8 windspeed 10886 non-null float64 9 casual 10886 non-null int64 10 registered 10886 non-null int64 11 count 10886 non-null int64 dtypes: float64(3), int64(8), object(1) memory usage: 1020.7+ KB
In [48]:
df.head()
Out[48]:
| datetime | season | holiday | workingday | weather | temp | atemp | humidity | windspeed | casual | registered | count | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2011-01-01 00:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 81 | 0.0 | 3 | 13 | 16 |
| 1 | 2011-01-01 01:00:00 | 1 | 0 | 0 | 1 | 9.02 | 13.635 | 80 | 0.0 | 8 | 32 | 40 |
| 2 | 2011-01-01 02:00:00 | 1 | 0 | 0 | 1 | 9.02 | 13.635 | 80 | 0.0 | 5 | 27 | 32 |
| 3 | 2011-01-01 03:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 75 | 0.0 | 3 | 10 | 13 |
| 4 | 2011-01-01 04:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 75 | 0.0 | 0 | 1 | 1 |
In [49]:
df.tail()
Out[49]:
| datetime | season | holiday | workingday | weather | temp | atemp | humidity | windspeed | casual | registered | count | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 10881 | 2012-12-19 19:00:00 | 4 | 0 | 1 | 1 | 15.58 | 19.695 | 50 | 26.0027 | 7 | 329 | 336 |
| 10882 | 2012-12-19 20:00:00 | 4 | 0 | 1 | 1 | 14.76 | 17.425 | 57 | 15.0013 | 10 | 231 | 241 |
| 10883 | 2012-12-19 21:00:00 | 4 | 0 | 1 | 1 | 13.94 | 15.910 | 61 | 15.0013 | 4 | 164 | 168 |
| 10884 | 2012-12-19 22:00:00 | 4 | 0 | 1 | 1 | 13.94 | 17.425 | 61 | 6.0032 | 12 | 117 | 129 |
| 10885 | 2012-12-19 23:00:00 | 4 | 0 | 1 | 1 | 13.12 | 16.665 | 66 | 8.9981 | 4 | 84 | 88 |
In [50]:
df.describe(include='all')
Out[50]:
| datetime | season | holiday | workingday | weather | temp | atemp | humidity | windspeed | casual | registered | count | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 10886 | 10886.000000 | 10886.000000 | 10886.000000 | 10886.000000 | 10886.00000 | 10886.000000 | 10886.000000 | 10886.000000 | 10886.000000 | 10886.000000 | 10886.000000 |
| unique | 10886 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| top | 2012-12-19 23:00:00 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| freq | 1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| mean | NaN | 2.506614 | 0.028569 | 0.680875 | 1.418427 | 20.23086 | 23.655084 | 61.886460 | 12.799395 | 36.021955 | 155.552177 | 191.574132 |
| std | NaN | 1.116174 | 0.166599 | 0.466159 | 0.633839 | 7.79159 | 8.474601 | 19.245033 | 8.164537 | 49.960477 | 151.039033 | 181.144454 |
| min | NaN | 1.000000 | 0.000000 | 0.000000 | 1.000000 | 0.82000 | 0.760000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| 25% | NaN | 2.000000 | 0.000000 | 0.000000 | 1.000000 | 13.94000 | 16.665000 | 47.000000 | 7.001500 | 4.000000 | 36.000000 | 42.000000 |
| 50% | NaN | 3.000000 | 0.000000 | 1.000000 | 1.000000 | 20.50000 | 24.240000 | 62.000000 | 12.998000 | 17.000000 | 118.000000 | 145.000000 |
| 75% | NaN | 4.000000 | 0.000000 | 1.000000 | 2.000000 | 26.24000 | 31.060000 | 77.000000 | 16.997900 | 49.000000 | 222.000000 | 284.000000 |
| max | NaN | 4.000000 | 1.000000 | 1.000000 | 4.000000 | 41.00000 | 45.455000 | 100.000000 | 56.996900 | 367.000000 | 886.000000 | 977.000000 |
DATA CLEANING¶
In [51]:
df.duplicated().any()
Out[51]:
np.False_
In [52]:
parts = df['datetime'].str.split(' ', n=2, expand=True)
df['date'] = parts[0]
df['time'] = parts[1].str[:2].astype('int')
parts = df['date'].str.split('-', n=2, expand=True)
df['year'] = parts[0].astype('int')
df['month'] = parts[1].astype('int')
df['day'] = parts[2].astype('int')
In [53]:
from datetime import date
import holidays
def is_holiday(x):
india_holidays = holidays.country_holidays('IN')
if india_holidays.get(x):
return 1
else:
return 0
df['holidays'] = df['date'].apply(is_holiday)
df.head()
In [17]:
# Check the columns in the DataFrame
print(df.columns)
# Drop only the columns that exist in the DataFrame
columns_to_drop = ['datetime', 'holiday', 'workingday', 'date']
existing_columns_to_drop = [col for col in columns_to_drop if col in df.columns]
# Drop the existing columns
df.drop(columns=existing_columns_to_drop, axis=1, inplace=True)
Index(['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp',
'atemp', 'humidity', 'windspeed', 'casual', 'registered', 'count',
'date', 'time', 'year', 'month', 'day'],
dtype='object')
In [54]:
df.head()
Out[54]:
| datetime | season | holiday | workingday | weather | temp | atemp | humidity | windspeed | casual | registered | count | date | time | year | month | day | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2011-01-01 00:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 81 | 0.0 | 3 | 13 | 16 | 2011-01-01 | 0 | 2011 | 1 | 1 |
| 1 | 2011-01-01 01:00:00 | 1 | 0 | 0 | 1 | 9.02 | 13.635 | 80 | 0.0 | 8 | 32 | 40 | 2011-01-01 | 1 | 2011 | 1 | 1 |
| 2 | 2011-01-01 02:00:00 | 1 | 0 | 0 | 1 | 9.02 | 13.635 | 80 | 0.0 | 5 | 27 | 32 | 2011-01-01 | 2 | 2011 | 1 | 1 |
| 3 | 2011-01-01 03:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 75 | 0.0 | 3 | 10 | 13 | 2011-01-01 | 3 | 2011 | 1 | 1 |
| 4 | 2011-01-01 04:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 75 | 0.0 | 0 | 1 | 1 | 2011-01-01 | 4 | 2011 | 1 | 1 |
In [19]:
df['time'].unique()
Out[19]:
array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23])
In [20]:
df.isnull().sum()
Out[20]:
season 0 weather 0 temp 0 atemp 0 humidity 0 windspeed 0 casual 0 registered 0 count 0 time 0 year 0 month 0 day 0 dtype: int64
VISUAL DATA EXPLORATORY¶
In [21]:
numerical = df.drop(columns=["season","weather","time","day","month","year"],axis = 1, )
categorical = df[["season","weather","time","day","month","year"]]
numerical.head()
Out[21]:
| temp | atemp | humidity | windspeed | casual | registered | count | |
|---|---|---|---|---|---|---|---|
| 0 | 9.84 | 14.395 | 81 | 0.0 | 3 | 13 | 16 |
| 1 | 9.02 | 13.635 | 80 | 0.0 | 8 | 32 | 40 |
| 2 | 9.02 | 13.635 | 80 | 0.0 | 5 | 27 | 32 |
| 3 | 9.84 | 14.395 | 75 | 0.0 | 3 | 10 | 13 |
| 4 | 9.84 | 14.395 | 75 | 0.0 | 0 | 1 | 1 |
In [22]:
sns.countplot(x='season', data=df)
Out[22]:
<Axes: xlabel='season', ylabel='count'>
In [23]:
df['season'].value_counts()
Out[23]:
season 4 2734 2 2733 3 2733 1 2686 Name: count, dtype: int64
In [24]:
sns.countplot(x='weather', data=df)
Out[24]:
<Axes: xlabel='weather', ylabel='count'>
In [25]:
df['weather'].value_counts()
Out[25]:
weather 1 7192 2 2834 3 859 4 1 Name: count, dtype: int64
In [26]:
sns.countplot(x='time', data=df)
Out[26]:
<Axes: xlabel='time', ylabel='count'>
In [27]:
df['time'].value_counts()
Out[27]:
time 16 456 17 456 18 456 19 456 12 456 13 456 14 456 15 456 20 456 21 456 22 456 23 456 8 455 6 455 7 455 0 455 10 455 9 455 11 455 1 454 5 452 2 448 4 442 3 433 Name: count, dtype: int64
In [28]:
sns.countplot(x='day', data=df)
Out[28]:
<Axes: xlabel='day', ylabel='count'>
In [29]:
df['day'].value_counts()
Out[29]:
day 1 575 9 575 5 575 17 575 16 574 19 574 7 574 8 574 15 574 13 574 4 574 14 574 2 573 3 573 12 573 6 572 10 572 11 568 18 563 Name: count, dtype: int64
In [30]:
sns.countplot(x='month', data=df)
Out[30]:
<Axes: xlabel='month', ylabel='count'>
In [31]:
sns.countplot(x='year', data=df)
Out[31]:
<Axes: xlabel='year', ylabel='count'>
In [32]:
sns.displot(x='temp', data=df)
Out[32]:
<seaborn.axisgrid.FacetGrid at 0x218be9d94c0>
In [33]:
sns.displot(x='atemp', data=df)
Out[33]:
<seaborn.axisgrid.FacetGrid at 0x218be7a8ec0>
In [34]:
sns.displot(x='humidity', data=df)
Out[34]:
<seaborn.axisgrid.FacetGrid at 0x218be7a9af0>
In [35]:
sns.displot(x='windspeed', data=df)
Out[35]:
<seaborn.axisgrid.FacetGrid at 0x218be962cc0>
In [36]:
df['windspeed'].value_counts()
Out[36]:
windspeed 0.0000 1313 8.9981 1120 11.0014 1057 12.9980 1042 7.0015 1034 15.0013 961 6.0032 872 16.9979 824 19.0012 676 19.9995 492 22.0028 372 23.9994 274 26.0027 235 27.9993 187 30.0026 111 31.0009 89 32.9975 80 35.0008 58 39.0007 27 36.9974 22 43.0006 12 40.9973 11 43.9989 8 46.0022 3 47.9988 2 56.9969 2 51.9987 1 50.0021 1 Name: count, dtype: int64
In [37]:
sns.displot(x='casual', data=df)
Out[37]:
<seaborn.axisgrid.FacetGrid at 0x218be8ebb60>
In [38]:
df['casual'].value_counts()
Out[38]:
casual
0 986
1 667
2 487
3 438
4 354
...
294 1
280 1
216 1
292 1
304 1
Name: count, Length: 309, dtype: int64
In [39]:
sns.displot(x='registered', data=df)
Out[39]:
<seaborn.axisgrid.FacetGrid at 0x218be7a3e60>
In [40]:
df['registered'].value_counts()
Out[40]:
registered
3 195
4 190
5 177
6 155
2 150
...
768 1
666 1
690 1
693 1
761 1
Name: count, Length: 731, dtype: int64
In [41]:
sns.displot(x='count', data=df, kde=True)
Out[41]:
<seaborn.axisgrid.FacetGrid at 0x218beb8a4e0>
In [42]:
df['count'].value_counts()
Out[42]:
count
5 169
4 149
3 144
6 135
2 132
...
819 1
830 1
825 1
688 1
636 1
Name: count, Length: 822, dtype: int64
In [24]:
fig, ax = plt.subplots(figsize=(20,10))
sns.pointplot(data=df, x='time', y='count')
Out[24]:
<Axes: xlabel='time', ylabel='count'>
In [29]:
fig, ax = plt.subplots(figsize=(20,10))
sns.pointplot(data=df, x='time', y='casual')
Out[29]:
<Axes: xlabel='time', ylabel='casual'>
In [37]:
fig, ax = plt.subplots(figsize=(20,10))
sns.pointplot(data=df, x='time', y='registered')
Out[37]:
<Axes: xlabel='time', ylabel='registered'>
In [31]:
fig, ax = plt.subplots(figsize=(20,10))
sns.pointplot(data=df, x='time', y='count', hue='weather')
Out[31]:
<Axes: xlabel='time', ylabel='count'>
In [ ]:
In [33]:
fig, ax = plt.subplots(figsize=(20,10))
sns.pointplot(data=df, x='time', y='count', hue='season')
Out[33]:
<Axes: xlabel='time', ylabel='count'>
In [55]:
fig, ax = plt.subplots(figsize=(20,10))
sns.pointplot(data=df, x='time', y='count', hue='workingday')
Out[55]:
<Axes: xlabel='time', ylabel='count'>
In [56]:
fig, ax = plt.subplots(figsize=(20,10))
sns.pointplot(data=df, x='time', y='count', hue='holiday')
Out[56]:
<Axes: xlabel='time', ylabel='count'>
In [36]:
fig, ax = plt.subplots(figsize=(20,10))
sns.barplot(data=df, x='month', y='count')
Out[36]:
<Axes: xlabel='month', ylabel='count'>
In [57]:
fig, ax = plt.subplots(figsize=(20,10))
sns.barplot(data=df, x='season', y='count')
Out[57]:
<Axes: xlabel='season', ylabel='count'>
In [58]:
fig, ax = plt.subplots(figsize=(20,10))
sns.barplot(data=df, x='weather', y='count')
Out[58]:
<Axes: xlabel='weather', ylabel='count'>
In [60]:
fig, ax = plt.subplots(figsize=(20,10))
sns.pointplot(data=df, x='registered', y='count')
Out[60]:
<Axes: xlabel='registered', ylabel='count'>
In [61]:
fig, ax = plt.subplots(figsize=(20,10))
sns.pointplot(data=df, x='casual', y='count')
Out[61]:
<Axes: xlabel='casual', ylabel='count'>
In [62]:
fig, ax = plt.subplots(figsize=(20,10))
sns.pointplot(data=df, x='temp', y='count')
Out[62]:
<Axes: xlabel='temp', ylabel='count'>
In [63]:
fig, ax = plt.subplots(figsize=(20,10))
sns.pointplot(data=df, x='windspeed', y='count')
Out[63]:
<Axes: xlabel='windspeed', ylabel='count'>
In [67]:
fig, ax = plt.subplots(figsize=(20,10))
sns.pointplot(data=df, x='humidity', y='count')
Out[67]:
<Axes: xlabel='humidity', ylabel='count'>
In [68]:
fig, ax = plt.subplots(figsize=(20,10))
sns.pointplot(data=df, x='year', y='count')
Out[68]:
<Axes: xlabel='year', ylabel='count'>
In [43]:
sns.boxplot(x='temp',data=df)
Out[43]:
<Axes: xlabel='temp'>
In [44]:
sns.boxplot(x='atemp',data=df)
Out[44]:
<Axes: xlabel='atemp'>
In [45]:
sns.boxplot(x='humidity',data=df)
Out[45]:
<Axes: xlabel='humidity'>
In [46]:
sns.boxplot(x='windspeed',data=df)
Out[46]:
<Axes: xlabel='windspeed'>
In [47]:
sns.boxplot(x='casual',data=df)
Out[47]:
<Axes: xlabel='casual'>
In [48]:
sns.boxplot(x='registered',data=df)
Out[48]:
<Axes: xlabel='registered'>
In [49]:
sns.boxplot(x='count',data=df)
Out[49]:
<Axes: xlabel='count'>
In [50]:
#treating outliers
outliers=df[df['windspeed']>32]
outliers=df[df['humidity']<10]
outliers.shape
Out[50]:
(23, 13)
In [51]:
df=df[(df['windspeed']<32) & (df['humidity']>10)]
df.shape
Out[51]:
(10636, 13)
In [52]:
corr = df.corr()
In [53]:
corr
Out[53]:
| season | weather | temp | atemp | humidity | windspeed | casual | registered | count | time | year | month | day | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| season | 1.000000 | 0.008688 | 0.251069 | 0.255650 | 0.170627 | -0.117854 | 0.089204 | 0.160919 | 0.158798 | -0.002001 | -0.007863 | 0.971383 | 0.002861 |
| weather | 0.008688 | 1.000000 | -0.057852 | -0.058865 | 0.426167 | -0.003800 | -0.134406 | -0.107356 | -0.126598 | -0.022993 | -0.006757 | 0.010762 | -0.008303 |
| temp | 0.251069 | -0.057852 | 1.000000 | 0.985054 | -0.077164 | -0.003356 | 0.464794 | 0.315089 | 0.390958 | 0.148215 | 0.062043 | 0.250072 | 0.015292 |
| atemp | 0.255650 | -0.058865 | 0.985054 | 1.000000 | -0.056812 | -0.041263 | 0.459791 | 0.311121 | 0.386268 | 0.144198 | 0.059383 | 0.255141 | 0.010752 |
| humidity | 0.170627 | 0.426167 | -0.077164 | -0.056812 | 1.000000 | -0.303318 | -0.363920 | -0.280266 | -0.334095 | -0.279055 | -0.089708 | 0.187304 | -0.010524 |
| windspeed | -0.117854 | -0.003800 | -0.003356 | -0.041263 | -0.303318 | 1.000000 | 0.104181 | 0.103729 | 0.115236 | 0.142263 | -0.010875 | -0.120907 | 0.035901 |
| casual | 0.089204 | -0.134406 | 0.464794 | 0.459791 | -0.363920 | 0.104181 | 1.000000 | 0.496717 | 0.690039 | 0.306292 | 0.144988 | 0.085978 | 0.015727 |
| registered | 0.160919 | -0.107356 | 0.315089 | 0.311121 | -0.280266 | 0.103729 | 0.496717 | 1.000000 | 0.970925 | 0.384322 | 0.261156 | 0.166388 | 0.021157 |
| count | 0.158798 | -0.126598 | 0.390958 | 0.386268 | -0.334095 | 0.115236 | 0.690039 | 0.970925 | 1.000000 | 0.404975 | 0.257774 | 0.162469 | 0.021981 |
| time | -0.002001 | -0.022993 | 0.148215 | 0.144198 | -0.279055 | 0.142263 | 0.306292 | 0.384322 | 0.404975 | 1.000000 | -0.002148 | -0.002486 | 0.003051 |
| year | -0.007863 | -0.006757 | 0.062043 | 0.059383 | -0.089708 | -0.010875 | 0.144988 | 0.261156 | 0.257774 | -0.002148 | 1.000000 | -0.007670 | 0.003366 |
| month | 0.971383 | 0.010762 | 0.250072 | 0.255141 | 0.187304 | -0.120907 | 0.085978 | 0.166388 | 0.162469 | -0.002486 | -0.007670 | 1.000000 | 0.003605 |
| day | 0.002861 | -0.008303 | 0.015292 | 0.010752 | -0.010524 | 0.035901 | 0.015727 | 0.021157 | 0.021981 | 0.003051 | 0.003366 | 0.003605 | 1.000000 |
In [54]:
corrmat =df.corr()
plt.figure(figsize=(10,10))
sns.heatmap(corrmat, annot=True,cmap='coolwarm',center=0)
plt.show()
In [55]:
sns.pairplot(df)
Out[55]:
<seaborn.axisgrid.PairGrid at 0x218be88cc20>
In [56]:
#treatment of multicollinearity
df = df.drop(['temp','registered','month'],axis=1)
In [57]:
df.shape
Out[57]:
(10636, 10)
MODELLING¶
In [58]:
pip install scikit-learn
Requirement already satisfied: scikit-learn in c:\users\ramasish chatterjee\appdata\local\programs\python\python312\lib\site-packages (1.5.1) Requirement already satisfied: numpy>=1.19.5 in c:\users\ramasish chatterjee\appdata\local\programs\python\python312\lib\site-packages (from scikit-learn) (2.0.0) Requirement already satisfied: scipy>=1.6.0 in c:\users\ramasish chatterjee\appdata\local\programs\python\python312\lib\site-packages (from scikit-learn) (1.14.0) Requirement already satisfied: joblib>=1.2.0 in c:\users\ramasish chatterjee\appdata\local\programs\python\python312\lib\site-packages (from scikit-learn) (1.4.2) Requirement already satisfied: threadpoolctl>=3.1.0 in c:\users\ramasish chatterjee\appdata\local\programs\python\python312\lib\site-packages (from scikit-learn) (3.5.0) Note: you may need to restart the kernel to use updated packages.
Applying OLS and checking CLRM assumptions for linear regression¶
In [59]:
import pandas as pd
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.stats.diagnostic import het_breuschpagan
from statsmodels.stats.stattools import durbin_watson
# Separate features and target variable
x = df.drop(columns=['count'], axis=1)
x = sm.add_constant(x)
y = df['count'].values
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=22)
# Fit the linear regression model
lr = sm.OLS(y_train, X_train).fit()
lr.params
lr.summary()
Out[59]:
| Dep. Variable: | y | R-squared: | 0.568 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | 0.568 |
| Method: | Least Squares | F-statistic: | 1399. |
| Date: | Wed, 17 Jul 2024 | Prob (F-statistic): | 0.00 |
| Time: | 23:02:27 | Log-Likelihood: | -59299. |
| No. Observations: | 9572 | AIC: | 1.186e+05 |
| Df Residuals: | 9562 | BIC: | 1.187e+05 |
| Df Model: | 9 | ||
| Covariance Type: | nonrobust |
| coef | std err | t | P>|t| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| const | -1.267e+05 | 4953.191 | -25.576 | 0.000 | -1.36e+05 | -1.17e+05 |
| season | 18.0748 | 1.158 | 15.611 | 0.000 | 15.805 | 20.344 |
| weather | -5.8349 | 2.171 | -2.688 | 0.007 | -10.090 | -1.580 |
| atemp | 1.5072 | 0.168 | 8.967 | 0.000 | 1.178 | 1.837 |
| humidity | -0.6140 | 0.083 | -7.377 | 0.000 | -0.777 | -0.451 |
| windspeed | 0.7209 | 0.174 | 4.137 | 0.000 | 0.379 | 1.062 |
| casual | 1.9242 | 0.030 | 63.177 | 0.000 | 1.864 | 1.984 |
| time | 5.4440 | 0.189 | 28.858 | 0.000 | 5.074 | 5.814 |
| year | 62.9861 | 2.462 | 25.580 | 0.000 | 58.160 | 67.813 |
| day | 0.3296 | 0.221 | 1.491 | 0.136 | -0.104 | 0.763 |
| Omnibus: | 3805.039 | Durbin-Watson: | 1.997 |
|---|---|---|---|
| Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 15045.805 |
| Skew: | 1.984 | Prob(JB): | 0.00 |
| Kurtosis: | 7.688 | Cond. No. | 8.22e+06 |
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 8.22e+06. This might indicate that there are
strong multicollinearity or other numerical problems.
In [60]:
#checking VIF for multicollinearity
vif_data = pd.DataFrame()
vif_data["feature"] = x.columns
vif_data["VIF"] = [variance_inflation_factor(x.values, i) for i in range(x.shape[1])]
print(vif_data)
feature VIF 0 const 1.667440e+07 1 season 1.120878e+00 2 weather 1.272024e+00 3 atemp 1.364249e+00 4 humidity 1.684794e+00 5 windspeed 1.138664e+00 6 casual 1.564723e+00 7 time 1.167136e+00 8 year 1.030064e+00 9 day 1.001633e+00
In [61]:
#Checking heteroscedasticity
from statsmodels.stats.diagnostic import het_goldfeldquandt
from sklearn.linear_model import LinearRegression
residuals = y_train - lr.predict(X_train)
#sorted residuals
sorted_residuals = np.sort(residuals)
#Find the split point for the subsets
split_index = int(len(sorted_residuals) * 0.5)
#Split the data into two subsets based on residuals
lower_half = sorted_residuals[:split_index]
upper_half = sorted_residuals[split_index:]
#Calculate the Goldfeld-Quandt test statistic and p-value
gq_test_statistic, gq_p_value, _ = het_goldfeldquandt(y_train, X_train)
print("Goldfeld-Quandt test Statistic:", gq_test_statistic)
print("Goldfeld-Quandt Test p-value:", gq_p_value)
Goldfeld-Quandt test Statistic: 0.9086831174161009 Goldfeld-Quandt Test p-value: 0.9995303028283935
FEATURE SCALING¶
In [62]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
In [63]:
from sklearn import metrics
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
ML MODELS¶
Linear Regression¶
In [64]:
from sklearn.linear_model import LinearRegression
lnr = LinearRegression()
lnr_model = lnr.fit(X_train_scaled, y_train)
lnr_pred = lnr.predict(X_test_scaled)
linear_r2 = r2_score(y_test, lnr_pred)
print("Linear Regression R2:", linear_r2)
linear_rmse = mean_squared_error(y_test, lnr_pred, squared=False)
print(linear_rmse)
Linear Regression R2: 0.5546065483357989 124.57231746528659
C:\Users\Ramasish Chatterjee\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\metrics\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'. warnings.warn(
Decision Tree Regressor¶
In [65]:
from sklearn.tree import DecisionTreeRegressor
dt = DecisionTreeRegressor(random_state=42)
dt_model = dt.fit(X_train_scaled, y_train)
dt_pred = dt.predict(X_test_scaled)
tree_r2 = r2_score(y_test, dt_pred)
print("Decision Tree R2:", tree_r2)
tree_rmse = mean_squared_error(y_test, dt_pred, squared=False)
print(tree_rmse)
Decision Tree R2: 0.7442019049886539 94.4057143915263
C:\Users\Ramasish Chatterjee\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\metrics\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'. warnings.warn(
Random Forest Regressor¶
In [66]:
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor(random_state=42)
rfr_model = rfr.fit(X_train_scaled, y_train)
rfr_pred = rfr.predict(X_test_scaled)
forest_r2 = r2_score(y_test, rfr_pred)
print("Random Forest R2:", forest_r2)
forest_rmse = mean_squared_error(y_test, rfr_pred, squared=False)
print(forest_rmse)
Random Forest R2: 0.8741353950924678 66.22186971657776
C:\Users\Ramasish Chatterjee\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\metrics\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'. warnings.warn(
HYPERPARAMETER TUNING¶
In [67]:
from sklearn.model_selection import GridSearchCV
In [68]:
dt_params={'max_depth':[None,10,20,30],'min_samples_split':[2,5,10],'min_samples_split':[2,5,10]}
dt_grid=GridSearchCV(dt,dt_params,cv=5)
dt_grid.fit(X_train_scaled,y_train)
Out[68]:
GridSearchCV(cv=5, estimator=DecisionTreeRegressor(random_state=42),
param_grid={'max_depth': [None, 10, 20, 30],
'min_samples_split': [2, 5, 10]})In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GridSearchCV(cv=5, estimator=DecisionTreeRegressor(random_state=42),
param_grid={'max_depth': [None, 10, 20, 30],
'min_samples_split': [2, 5, 10]})DecisionTreeRegressor(max_depth=10, min_samples_split=10, random_state=42)
DecisionTreeRegressor(max_depth=10, min_samples_split=10, random_state=42)
In [69]:
dt_result=pd.DataFrame(dt_grid.cv_results_)
best_parameters_dt=dt_grid.best_params_
print(best_parameters_dt,dt_grid.best_score_)
{'max_depth': 10, 'min_samples_split': 10} 0.8129806416327222
In [70]:
ran_forest_params={'n_estimators':[50,100,200],'max_depth':[None,10,20,30],'min_samples_split':[2,5,10],'min_samples_split':[2,5,10]}
ran_forest_grid=GridSearchCV(rfr,ran_forest_params,cv=5)
ran_forest_grid.fit(X_train_scaled,y_train)
Out[70]:
GridSearchCV(cv=5, estimator=RandomForestRegressor(random_state=42),
param_grid={'max_depth': [None, 10, 20, 30],
'min_samples_split': [2, 5, 10],
'n_estimators': [50, 100, 200]})In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GridSearchCV(cv=5, estimator=RandomForestRegressor(random_state=42),
param_grid={'max_depth': [None, 10, 20, 30],
'min_samples_split': [2, 5, 10],
'n_estimators': [50, 100, 200]})RandomForestRegressor(max_depth=20, n_estimators=200, random_state=42)
RandomForestRegressor(max_depth=20, n_estimators=200, random_state=42)
In [71]:
ran_forest_result=pd.DataFrame(ran_forest_grid.cv_results_)
best_parameters_ran_forest=ran_forest_grid.best_params_
print(best_parameters_ran_forest,ran_forest_grid.best_score_)
{'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 200} 0.8658644179383306
Final Models¶
In [72]:
from sklearn.metrics import roc_curve
In [73]:
lnr = LinearRegression(fit_intercept=True, copy_X=True, n_jobs=None)
lnr_model = lnr.fit(X_train_scaled, y_train)
lnr_pred = lnr.predict(X_test_scaled)
linear_r2 = r2_score(y_test, lnr_pred)
print("Linear Regression R2:", linear_r2)
linear_rmse = mean_squared_error(y_test, lnr_pred, squared=False)
print(linear_rmse)
Linear Regression R2: 0.5546065483357989 124.57231746528659
C:\Users\Ramasish Chatterjee\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\metrics\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'. warnings.warn(
In [74]:
dtf=DecisionTreeRegressor(max_depth= 10, min_samples_split= 10)
dtf.fit(X_train_scaled,y_train)
y_pred_dtf=dtf.predict(X_test_scaled)
tree_r2 = r2_score(y_test, y_pred_dtf)
print("Decision Tree R2:", tree_r2)
tree_rmse = mean_squared_error(y_test, y_pred_dtf, squared=False)
print(tree_rmse)
Decision Tree R2: 0.8189796347652459 79.41697641736782
C:\Users\Ramasish Chatterjee\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\metrics\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'. warnings.warn(
In [75]:
ran_forestf=RandomForestRegressor(n_estimators = 200, max_depth = 20, min_samples_split = 2)
ran_forestf.fit(X_train_scaled,y_train)
y_pred_ran_forest_f=rfr.predict(X_test_scaled)
forest_r2 = r2_score(y_test, y_pred_ran_forest_f)
print("Random Forest R2:", forest_r2)
forest_rmse = mean_squared_error(y_test, y_pred_ran_forest_f, squared=False)
print(forest_rmse)
Random Forest R2: 0.8741353950924678 66.22186971657776
C:\Users\Ramasish Chatterjee\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\metrics\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'. warnings.warn(
Feature Importance¶
In [76]:
# Get the feature importances
importances = ran_forestf.feature_importances_
# Convert X_train_scaled back to DataFrame to get column names
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X_train.columns)
# Map the feature importances to their corresponding feature names
feature_importance_df = pd.DataFrame({
'Feature': X_train_scaled_df.columns,
'Importance': importances
}).sort_values(by='Importance', ascending=False)
# Display the feature importances
print(feature_importance_df)
# Plot the feature importances
plt.figure(figsize=(10, 8))
sns.barplot(x='Importance', y='Feature', data=feature_importance_df)
plt.title('Feature Importances from Random Forest Regressor')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.show()
Feature Importance 6 casual 0.595181 7 time 0.231478 8 year 0.045469 3 atemp 0.036505 4 humidity 0.030343 1 season 0.018919 9 day 0.018767 5 windspeed 0.016948 2 weather 0.006391 0 const 0.000000